The objective of this markdown is to give an overview of the big dataset COVID19_ALL.h5ad as well as the smaller “pilot” we have created. For detail on how the smaller dataset was created see Clean_data_subsetting.Rmd.

The data objects are AnnData objects. If you are confused about the structure see this.

Setup

library(tidyverse)
library(anndata)
setwd("~/sc_covid_PiB2023/data_science_project/Code") # changing the working directory
pilot <- read_h5ad('../Data/Pilot_2_rule_them_ALL.h5ad')
# Loading dthe large dataset
All_data <- read_h5ad("../../data/sc_covid/COVID19_ALL.h5ad") 
# be patient it takes about 15 min to load with 128 G of RAM available

We’ll save the annotations in a dataframe to make it easier to work with ggplot:

pilot_obs <- data.frame(pilot$obs)
All_data_obs <- data.frame(All_data$obs)

Dimensions

This is just to give an overview of the size of the expression matrix for both. It’s \(cell \times genes\):

All_data:

c(All_data$n_obs, All_data$n_vars)
## [1] 1462702   27943

1,462,702 cells and 27,943 genes

Pilot:

dim(pilot$X)
## [1]  5694 23382

5,694 cells and 23,382 genes.

Annotations

Overview of the annotations in both

All_data:

colnames(All_data_obs)
##  [1] "celltype"                                       
##  [2] "majorType"                                      
##  [3] "sampleID"                                       
##  [4] "PatientID"                                      
##  [5] "datasets"                                       
##  [6] "City"                                           
##  [7] "Age"                                            
##  [8] "Sex"                                            
##  [9] "Sample.type"                                    
## [10] "CoVID.19.severity"                              
## [11] "Sample.time"                                    
## [12] "Sampling.day..Days.after.symptom.onset."        
## [13] "SARS.CoV.2"                                     
## [14] "Single.cell.sequencing.platform"                
## [15] "BCR.single.cell.sequencing"                     
## [16] "TCR.single.cell.sequencing"                     
## [17] "Outcome"                                        
## [18] "Comorbidities"                                  
## [19] "COVID.19.related.medication.and.anti.microbials"
## [20] "Leukocytes..G.L."                               
## [21] "Neutrophils..G.L."                              
## [22] "Lymphocytes..G.L."                              
## [23] "Unpublished"

Pilot:

colnames(pilot_obs)
## [1] "cellType"          "viralLoad"         "PatientID"        
## [4] "sampleID"          "Is_infected"       "Age"              
## [7] "Sex"               "CoVID.19.severity"

Celltype / major type

All_data:

There are a lot of different celltypes in All_data

length(unique(All_data$obs$celltype))
## [1] 64

So it will not yeild a pretty plot. But we’ll try anyway:

ggplot(All_data_obs, aes(celltype, fill = celltype))+
  geom_bar(stat = "count" ) + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 6),
        legend.text = element_text(size = 8),  # Adjust the size parameter for legend text
        legend.title = element_text(size = 10),  # Adjust the size parameter for the legend title
        legend.key.size = unit(0.8, "cm"), 
        legend.position="bottom")

The annotation majorType is much closer to cellType in the pilot.

ggplot(All_data_obs, aes(majorType, fill = majorType))+
  geom_bar(stat = "count" ) + 
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5, size = 3) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Pilot:

ggplot(pilot_obs, aes(cellType, fill = cellType))+
  geom_bar(stat = "count" ) + 
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = +1.5) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Gender

We’ll also have to make a different dataframe such that everything related to the patient does not repeat over multiple rows

# for All_data:
All_data_df <- All_data_obs %>% 
                select(c("PatientID","City","Age","Sex","CoVID.19.severity")) %>% 
                unique()

# for the pilot:
pilot_df <- pilot_obs %>% 
        select(c("PatientID", "Age", "Sex", "CoVID.19.severity")) %>% 
        unique()

They made a mistake when creating the large dataset:

All_data_df[All_data_df$PatientID=="P-M026",]
##                      PatientID      City Age Sex CoVID.19.severity
## AAACCTGAGGATATAC-136    P-M026 Chongqing  32   F     mild/moderate
## AAACCTGAGGTGCACA-140    P-M026 Chongqing  62   F     mild/moderate

Somehow this woman with patient id P-M026 is simultaniously 32 and 62 years old

All_data:

ggplot(All_data_df, aes(Sex, fill = Sex))+
  geom_bar(stat = "count" ) + 
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5)

Pilot:

ggplot(pilot_df, aes(Sex, fill = Sex))+
  geom_bar(stat = "count" ) + 
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = +2)

Age

All_data:

All_ages <- as.numeric(as.character(All_data_df$Age[All_data_df$Age != "unknown"]))
summary(All_ages)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.00   35.00   49.00   49.22   62.00   92.00
as.data.frame(All_ages) %>% 
  ggplot(aes(All_ages))+
  xlab("Age")+
  geom_bar()

Pilot

pilot_ages <- as.numeric(as.character(pilot_df$Age))
summary(pilot_ages)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   46.00   56.50   61.50   60.25   63.75   72.00
as.data.frame(pilot_ages) %>% 
ggplot(aes(pilot_ages))+
  geom_bar(stat = "count" ) + 
  xlab("Age")

Severity

All_data:

ggplot(All_data_df, aes(CoVID.19.severity, fill = CoVID.19.severity))+
  geom_bar(stat = "count" ) + 
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = 2)

Pilot:

unique(pilot_df$CoVID.19.severity)
## [1] severe/critical
## Levels: severe/critical

Genes

We have some potentially interesting genes from Clean_PCA.Rmd that were responsible for most of the varience; MALAT1, B2M, MT-CO2, FTL, MT-CO1.

Furthermore it might be fun to look at the highest expressed genes (on average) as well as the genes that has the largest variance (we only do this for the pilot since the large dataset would take wayyyyy too long). We first find the most expressed genes and the genes with the most variance:

AvExpression <- apply(pilot$X, 2, mean) # Compute average gene expression across cell types
varExpression <- apply(pilot$X, 2, var) # Compute variance of expression of genes

M = 3 # number of most expressed cells we want

Av_genes <- names(AvExpression[order(AvExpression, decreasing = T)][1:M]) # Find M most expressed genes
var_genes <- names(varExpression[order(varExpression, decreasing = T)][1:M]) # Find M most expressed genes(or varience)

# Extract the expression matrix for the given genes as a dense matrix
matrix_av <- as.data.frame(as.matrix(pilot[, (Av_genes)]$X)) %>% cbind(pilot$obs)
matrix_var <- as.data.frame(as.matrix(pilot[, (var_genes)]$X))  %>% cbind(pilot$obs)

Most expressed:

Av_genes
## [1] "MALAT1" "B2M"    "MT-CO2"

Boxplot showing the expression of a specified gene for all cells of each cell type (colored by viral load).

# "MALAT1"
matrix_av %>% 
  pivot_longer(cols = "MALAT1") %>%
  ggplot(aes(x = cellType, y = value, col = viralLoad))+
  scale_color_gradient(low='blue', high = 'red')+
  geom_boxplot() + 
  geom_jitter(alpha = 0.35, size = 0.4) + 
  facet_wrap(~name)+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        legend.position = "right") + 
  labs(y = "Expression Count (Normalized)", x = "Cell Type")

# "B2M"
matrix_av %>% 
  pivot_longer(cols = "B2M") %>%
  ggplot(aes(x = cellType, y = value, col = viralLoad))+
  scale_color_gradient(low='blue', high = 'red')+
  geom_boxplot() + 
  geom_jitter(alpha = 0.35, size = 0.4) + 
  facet_wrap(~name)+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        legend.position = "right") + 
  labs(y = "Expression Count (Normalized)", x = "Cell Type")

# "MT-CO2"
matrix_av %>% 
  pivot_longer(cols = "MT-CO2") %>%
  ggplot(aes(x = cellType, y = value, col = viralLoad))+
  scale_color_gradient(low='blue', high = 'red')+
  geom_boxplot() + 
  geom_jitter(alpha = 0.35, size = 0.4) + 
  facet_wrap(~name)+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        legend.position = "right") + 
  labs(y = "Expression Count (Normalized)", x = "Cell Type")

Most variance:

The same is done for the genes that have the most variance:

var_genes
## [1] "CCL2"   "SLPI"   "S100A8"
#  "CCL2"
matrix_var %>% 
  pivot_longer(cols = "CCL2") %>%
  ggplot(aes(x = cellType, y = value, col = viralLoad))+
  scale_color_gradient(low='blue', high = 'red')+
  geom_boxplot() + 
  geom_jitter(alpha = 0.35, size = 0.4) + 
  facet_wrap(~name)+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        legend.position = "right") + 
  labs(y = "Expression Count (Normalized)", x = "Cell Type")

# "SLPI"
matrix_var %>% 
  pivot_longer(cols = "SLPI") %>%
  ggplot(aes(x = cellType, y = value, col = viralLoad))+
  scale_color_gradient(low='blue', high = 'red')+
  geom_boxplot() + 
  geom_jitter(alpha = 0.35, size = 0.4) + 
  facet_wrap(~name)+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        legend.position = "right") + 
  labs(y = "Expression Count (Normalized)", x = "Cell Type")

# "S100A8"
matrix_var %>% 
  pivot_longer(cols = "S100A8") %>%
  ggplot(aes(x = cellType, y = value, col = viralLoad))+
  scale_color_gradient(low='blue', high = 'red')+
  geom_boxplot() + 
  geom_jitter(alpha = 0.35, size = 0.4) + 
  facet_wrap(~name)+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        legend.position = "right") + 
  labs(y = "Expression Count (Normalized)", x = "Cell Type")

Heatmap

A heatmap showing the expression of the \(M\) most expressed genes (on average) for \(M\) randomly sampled cells:

M = 10 # amount of genes and cells

AvExpression <- apply(pilot$X, 2, mean)
MostExpressed <- names(AvExpression[order(AvExpression, decreasing = T)][1:M]) # Selecting the most expressed genes

genes <- MostExpressed # Insert genes, you wish to heat map
samples <- sample(1:5694, M)
heatMapData <- as.data.frame(as.matrix(pilot[(samples), (genes)]$X)) # M x genes data frame is selected

heatMapData %>% 
  rownames_to_column() %>% 
  gather(colname, value, -rowname) %>% 
  ggplot(aes(y = rowname, x = colname, fill = value)) + 
  geom_tile() + 
  scale_fill_gradient(low = "yellow", high = "purple") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 
  labs(title = "Heatmap of Selected Gene Expression Values", subtitle = "For Random Sample") + 
  xlab("Genes") + 
  ylab("Sample ID")